1 package org.apache.lucene.search;
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20 import java.io.IOException;
21 import java.util.ArrayList;
22 import java.util.Arrays;
23 import java.util.Comparator;
24 import java.util.HashMap;
25 import java.util.HashSet;
26 import java.util.LinkedHashMap;
27
28 import org.apache.lucene.index.Term;
29 import org.apache.lucene.search.similarities.Similarity;
30 import org.apache.lucene.util.FixedBitSet;
31
32 final class SloppyPhraseScorer extends Scorer {
33
34 private final ConjunctionDISI conjunction;
35 private final PhrasePositions[] phrasePositions;
36
37 private float sloppyFreq;
38
39 private final Similarity.SimScorer docScorer;
40
41 private final int slop;
42 private final int numPostings;
43 private final PhraseQueue pq;
44
45 private int end;
46
47 private boolean hasRpts;
48 private boolean checkedRpts;
49 private boolean hasMultiTermRpts;
50 private PhrasePositions[][] rptGroups;
51 private PhrasePositions[] rptStack;
52
53 private int numMatches;
54 final boolean needsScores;
55 private final float matchCost;
56
57 SloppyPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings,
58 int slop, Similarity.SimScorer docScorer, boolean needsScores,
59 float matchCost) {
60 super(weight);
61 this.docScorer = docScorer;
62 this.needsScores = needsScores;
63 this.slop = slop;
64 this.numPostings = postings==null ? 0 : postings.length;
65 pq = new PhraseQueue(postings.length);
66 DocIdSetIterator[] iterators = new DocIdSetIterator[postings.length];
67 phrasePositions = new PhrasePositions[postings.length];
68 for (int i = 0; i < postings.length; ++i) {
69 iterators[i] = postings[i].postings;
70 phrasePositions[i] = new PhrasePositions(postings[i].postings, postings[i].position, i, postings[i].terms);
71 }
72 conjunction = ConjunctionDISI.intersect(Arrays.asList(iterators));
73 this.matchCost = matchCost;
74 }
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94 private float phraseFreq() throws IOException {
95 if (!initPhrasePositions()) {
96 return 0.0f;
97 }
98 float freq = 0.0f;
99 numMatches = 0;
100 PhrasePositions pp = pq.pop();
101 int matchLength = end - pp.position;
102 int next = pq.top().position;
103 while (advancePP(pp)) {
104 if (hasRpts && !advanceRpts(pp)) {
105 break;
106 }
107 if (pp.position > next) {
108 if (matchLength <= slop) {
109 freq += docScorer.computeSlopFactor(matchLength);
110 numMatches++;
111 if (!needsScores) {
112 return freq;
113 }
114 }
115 pq.add(pp);
116 pp = pq.pop();
117 next = pq.top().position;
118 matchLength = end - pp.position;
119 } else {
120 int matchLength2 = end - pp.position;
121 if (matchLength2 < matchLength) {
122 matchLength = matchLength2;
123 }
124 }
125 }
126 if (matchLength <= slop) {
127 freq += docScorer.computeSlopFactor(matchLength);
128 numMatches++;
129 }
130 return freq;
131 }
132
133
134 private boolean advancePP(PhrasePositions pp) throws IOException {
135 if (!pp.nextPosition()) {
136 return false;
137 }
138 if (pp.position > end) {
139 end = pp.position;
140 }
141 return true;
142 }
143
144
145
146
147 private boolean advanceRpts(PhrasePositions pp) throws IOException {
148 if (pp.rptGroup < 0) {
149 return true;
150 }
151 PhrasePositions[] rg = rptGroups[pp.rptGroup];
152 FixedBitSet bits = new FixedBitSet(rg.length);
153 int k0 = pp.rptInd;
154 int k;
155 while((k=collide(pp)) >= 0) {
156 pp = lesser(pp, rg[k]);
157 if (!advancePP(pp)) {
158 return false;
159 }
160 if (k != k0) {
161 bits = FixedBitSet.ensureCapacity(bits, k);
162 bits.set(k);
163 }
164 }
165
166
167 int n = 0;
168
169 int numBits = bits.length();
170 while (bits.cardinality() > 0) {
171 PhrasePositions pp2 = pq.pop();
172 rptStack[n++] = pp2;
173 if (pp2.rptGroup >= 0
174 && pp2.rptInd < numBits
175 && bits.get(pp2.rptInd)) {
176 bits.clear(pp2.rptInd);
177 }
178 }
179
180 for (int i=n-1; i>=0; i--) {
181 pq.add(rptStack[i]);
182 }
183 return true;
184 }
185
186
187 private PhrasePositions lesser(PhrasePositions pp, PhrasePositions pp2) {
188 if (pp.position < pp2.position ||
189 (pp.position == pp2.position && pp.offset < pp2.offset)) {
190 return pp;
191 }
192 return pp2;
193 }
194
195
196 private int collide(PhrasePositions pp) {
197 int tpPos = tpPos(pp);
198 PhrasePositions[] rg = rptGroups[pp.rptGroup];
199 for (int i=0; i<rg.length; i++) {
200 PhrasePositions pp2 = rg[i];
201 if (pp2 != pp && tpPos(pp2) == tpPos) {
202 return pp2.rptInd;
203 }
204 }
205 return -1;
206 }
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223 private boolean initPhrasePositions() throws IOException {
224 end = Integer.MIN_VALUE;
225 if (!checkedRpts) {
226 return initFirstTime();
227 }
228 if (!hasRpts) {
229 initSimple();
230 return true;
231 }
232 return initComplex();
233 }
234
235
236 private void initSimple() throws IOException {
237
238 pq.clear();
239
240 for (PhrasePositions pp : phrasePositions) {
241 pp.firstPosition();
242 if (pp.position > end) {
243 end = pp.position;
244 }
245 pq.add(pp);
246 }
247 }
248
249
250 private boolean initComplex() throws IOException {
251
252 placeFirstPositions();
253 if (!advanceRepeatGroups()) {
254 return false;
255 }
256 fillQueue();
257 return true;
258 }
259
260
261 private void placeFirstPositions() throws IOException {
262 for (PhrasePositions pp : phrasePositions) {
263 pp.firstPosition();
264 }
265 }
266
267
268 private void fillQueue() {
269 pq.clear();
270 for (PhrasePositions pp : phrasePositions) {
271 if (pp.position > end) {
272 end = pp.position;
273 }
274 pq.add(pp);
275 }
276 }
277
278
279
280
281
282
283
284
285
286
287 private boolean advanceRepeatGroups() throws IOException {
288 for (PhrasePositions[] rg: rptGroups) {
289 if (hasMultiTermRpts) {
290
291 int incr;
292 for (int i=0; i<rg.length; i+=incr) {
293 incr = 1;
294 PhrasePositions pp = rg[i];
295 int k;
296 while((k=collide(pp)) >= 0) {
297 PhrasePositions pp2 = lesser(pp, rg[k]);
298 if (!advancePP(pp2)) {
299 return false;
300 }
301 if (pp2.rptInd < i) {
302 incr = 0;
303 break;
304 }
305 }
306 }
307 } else {
308
309 for (int j=1; j<rg.length; j++) {
310 for (int k=0; k<j; k++) {
311 if (!rg[j].nextPosition()) {
312 return false;
313 }
314 }
315 }
316 }
317 }
318 return true;
319 }
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335 private boolean initFirstTime() throws IOException {
336
337 checkedRpts = true;
338 placeFirstPositions();
339
340 LinkedHashMap<Term,Integer> rptTerms = repeatingTerms();
341 hasRpts = !rptTerms.isEmpty();
342
343 if (hasRpts) {
344 rptStack = new PhrasePositions[numPostings];
345 ArrayList<ArrayList<PhrasePositions>> rgs = gatherRptGroups(rptTerms);
346 sortRptGroups(rgs);
347 if (!advanceRepeatGroups()) {
348 return false;
349 }
350 }
351
352 fillQueue();
353 return true;
354 }
355
356
357
358 private void sortRptGroups(ArrayList<ArrayList<PhrasePositions>> rgs) {
359 rptGroups = new PhrasePositions[rgs.size()][];
360 Comparator<PhrasePositions> cmprtr = new Comparator<PhrasePositions>() {
361 @Override
362 public int compare(PhrasePositions pp1, PhrasePositions pp2) {
363 return pp1.offset - pp2.offset;
364 }
365 };
366 for (int i=0; i<rptGroups.length; i++) {
367 PhrasePositions[] rg = rgs.get(i).toArray(new PhrasePositions[0]);
368 Arrays.sort(rg, cmprtr);
369 rptGroups[i] = rg;
370 for (int j=0; j<rg.length; j++) {
371 rg[j].rptInd = j;
372 }
373 }
374 }
375
376
377 private ArrayList<ArrayList<PhrasePositions>> gatherRptGroups(LinkedHashMap<Term,Integer> rptTerms) throws IOException {
378 PhrasePositions[] rpp = repeatingPPs(rptTerms);
379 ArrayList<ArrayList<PhrasePositions>> res = new ArrayList<>();
380 if (!hasMultiTermRpts) {
381
382 for (int i=0; i<rpp.length; i++) {
383 PhrasePositions pp = rpp[i];
384 if (pp.rptGroup >=0) continue;
385 int tpPos = tpPos(pp);
386 for (int j=i+1; j<rpp.length; j++) {
387 PhrasePositions pp2 = rpp[j];
388 if (
389 pp2.rptGroup >=0
390 || pp2.offset == pp.offset
391 || tpPos(pp2) != tpPos) {
392 continue;
393 }
394
395 int g = pp.rptGroup;
396 if (g < 0) {
397 g = res.size();
398 pp.rptGroup = g;
399 ArrayList<PhrasePositions> rl = new ArrayList<>(2);
400 rl.add(pp);
401 res.add(rl);
402 }
403 pp2.rptGroup = g;
404 res.get(g).add(pp2);
405 }
406 }
407 } else {
408
409 ArrayList<HashSet<PhrasePositions>> tmp = new ArrayList<>();
410 ArrayList<FixedBitSet> bb = ppTermsBitSets(rpp, rptTerms);
411 unionTermGroups(bb);
412 HashMap<Term,Integer> tg = termGroups(rptTerms, bb);
413 HashSet<Integer> distinctGroupIDs = new HashSet<>(tg.values());
414 for (int i=0; i<distinctGroupIDs.size(); i++) {
415 tmp.add(new HashSet<PhrasePositions>());
416 }
417 for (PhrasePositions pp : rpp) {
418 for (Term t: pp.terms) {
419 if (rptTerms.containsKey(t)) {
420 int g = tg.get(t);
421 tmp.get(g).add(pp);
422 assert pp.rptGroup==-1 || pp.rptGroup==g;
423 pp.rptGroup = g;
424 }
425 }
426 }
427 for (HashSet<PhrasePositions> hs : tmp) {
428 res.add(new ArrayList<>(hs));
429 }
430 }
431 return res;
432 }
433
434
435 private final int tpPos(PhrasePositions pp) {
436 return pp.position + pp.offset;
437 }
438
439
440 private LinkedHashMap<Term,Integer> repeatingTerms() {
441 LinkedHashMap<Term,Integer> tord = new LinkedHashMap<>();
442 HashMap<Term,Integer> tcnt = new HashMap<>();
443 for (PhrasePositions pp : phrasePositions) {
444 for (Term t : pp.terms) {
445 Integer cnt0 = tcnt.get(t);
446 Integer cnt = cnt0==null ? new Integer(1) : new Integer(1+cnt0.intValue());
447 tcnt.put(t, cnt);
448 if (cnt==2) {
449 tord.put(t,tord.size());
450 }
451 }
452 }
453 return tord;
454 }
455
456
457 private PhrasePositions[] repeatingPPs(HashMap<Term,Integer> rptTerms) {
458 ArrayList<PhrasePositions> rp = new ArrayList<>();
459 for (PhrasePositions pp : phrasePositions) {
460 for (Term t : pp.terms) {
461 if (rptTerms.containsKey(t)) {
462 rp.add(pp);
463 hasMultiTermRpts |= (pp.terms.length > 1);
464 break;
465 }
466 }
467 }
468 return rp.toArray(new PhrasePositions[0]);
469 }
470
471
472 private ArrayList<FixedBitSet> ppTermsBitSets(PhrasePositions[] rpp, HashMap<Term,Integer> tord) {
473 ArrayList<FixedBitSet> bb = new ArrayList<>(rpp.length);
474 for (PhrasePositions pp : rpp) {
475 FixedBitSet b = new FixedBitSet(tord.size());
476 Integer ord;
477 for (Term t: pp.terms) {
478 if ((ord=tord.get(t))!=null) {
479 b.set(ord);
480 }
481 }
482 bb.add(b);
483 }
484 return bb;
485 }
486
487
488 private void unionTermGroups(ArrayList<FixedBitSet> bb) {
489 int incr;
490 for (int i=0; i<bb.size()-1; i+=incr) {
491 incr = 1;
492 int j = i+1;
493 while (j<bb.size()) {
494 if (bb.get(i).intersects(bb.get(j))) {
495 bb.get(i).or(bb.get(j));
496 bb.remove(j);
497 incr = 0;
498 } else {
499 ++j;
500 }
501 }
502 }
503 }
504
505
506 private HashMap<Term,Integer> termGroups(LinkedHashMap<Term,Integer> tord, ArrayList<FixedBitSet> bb) throws IOException {
507 HashMap<Term,Integer> tg = new HashMap<>();
508 Term[] t = tord.keySet().toArray(new Term[0]);
509 for (int i=0; i<bb.size(); i++) {
510 FixedBitSet bits = bb.get(i);
511 for (int ord = bits.nextSetBit(0); ord != DocIdSetIterator.NO_MORE_DOCS; ord = ord + 1 >= bits.length() ? DocIdSetIterator.NO_MORE_DOCS : bits.nextSetBit(ord + 1)) {
512 tg.put(t[ord],i);
513 }
514 }
515 return tg;
516 }
517
518 @Override
519 public int freq() {
520 return numMatches;
521 }
522
523 float sloppyFreq() {
524 return sloppyFreq;
525 }
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549 @Override
550 public int docID() {
551 return conjunction.docID();
552 }
553
554 @Override
555 public int nextDoc() throws IOException {
556 int doc;
557 for (doc = conjunction.nextDoc(); doc != NO_MORE_DOCS; doc = conjunction.nextDoc()) {
558 sloppyFreq = phraseFreq();
559 if (sloppyFreq != 0f) {
560 break;
561 }
562 }
563
564 return doc;
565 }
566
567 @Override
568 public float score() {
569 return docScorer.score(docID(), sloppyFreq);
570 }
571
572 @Override
573 public int advance(int target) throws IOException {
574 assert target > docID();
575 int doc;
576 for (doc = conjunction.advance(target); doc != NO_MORE_DOCS; doc = conjunction.nextDoc()) {
577 sloppyFreq = phraseFreq();
578 if (sloppyFreq != 0f) {
579 break;
580 }
581 }
582
583 return doc;
584 }
585
586 @Override
587 public long cost() {
588 return conjunction.cost();
589 }
590
591 @Override
592 public String toString() { return "scorer(" + weight + ")"; }
593
594 @Override
595 public TwoPhaseIterator asTwoPhaseIterator() {
596 return new TwoPhaseIterator(conjunction) {
597 @Override
598 public boolean matches() throws IOException {
599 sloppyFreq = phraseFreq();
600 return sloppyFreq != 0F;
601 }
602
603 @Override
604 public float matchCost() {
605 return matchCost;
606 }
607
608 @Override
609 public String toString() {
610 return "SloppyPhraseScorer@asTwoPhaseIterator(" + SloppyPhraseScorer.this + ")";
611 }
612 };
613 }
614 }